{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Intercept          25792.200199\n",
       "YearsExperience     9449.962321\n",
       "dtype: float64"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 导入第三方模块\n",
    "import pandas as pd\n",
    "import statsmodels.api as sm\n",
    "\n",
    "income = pd.read_csv('Salary_Data.csv')\n",
    "# 利用收入数据集，构建回归模型\n",
    "fit = sm.formula.ols('Salary ~ YearsExperience', data = income).fit()\n",
    "# 返回模型的参数值\n",
    "fit.params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "模型的偏回归系数分别为：\n",
      " Intercept               58581.516503\n",
      "C(State)[T.Florida]       927.394424\n",
      "C(State)[T.New York]     -513.468310\n",
      "RD_Spend                    0.803487\n",
      "Administration             -0.057792\n",
      "Marketing_Spend             0.013779\n",
      "dtype: float64\n",
      "对比预测值和实际值的差异：\n",
      "        Prediction       Real\n",
      "8   150621.345801  152211.77\n",
      "48   55513.218079   35673.41\n",
      "14  150369.022458  132602.65\n",
      "42   74057.015562   71498.49\n",
      "29  103413.378282  101004.64\n",
      "44   67844.850378   65200.33\n",
      "4   173454.059691  166187.94\n",
      "31   99580.888894   97483.56\n",
      "13  128147.138396  134307.35\n",
      "18  130693.433835  124266.90\n"
     ]
    }
   ],
   "source": [
    "# 导入模块\n",
    "from sklearn import model_selection\n",
    "\n",
    "# 导入数据\n",
    "Profit = pd.read_excel(r'Predict to Profit.xlsx')\n",
    "# 将数据集拆分为训练集和测试集\n",
    "train, test = model_selection.train_test_split(Profit, test_size = 0.2, random_state=1234)\n",
    "# 根据train数据集建模\n",
    "model = sm.formula.ols('Profit ~ RD_Spend+Administration+Marketing_Spend+C(State)', data = train).fit()\n",
    "\n",
    "print('模型的偏回归系数分别为：\\n', model.params)\n",
    "# 删除test数据集中的Profit变量，用剩下的自变量进行预测\n",
    "test_X = test.drop(labels = 'Profit', axis = 1)\n",
    "pred = model.predict(exog = test_X)\n",
    "\n",
    "print('对比预测值和实际值的差异：\\n',pd.DataFrame({'Prediction':pred,'Real':test.Profit}))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "模型的偏回归系数分别为：\n",
      " Intercept          58068.048193\n",
      "RD_Spend               0.803487\n",
      "Administration        -0.057792\n",
      "Marketing_Spend        0.013779\n",
      "Florida             1440.862734\n",
      "California           513.468310\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# 生成由State变量衍生的哑变量\n",
    "dummies = pd.get_dummies(Profit.State)\n",
    "# 将哑变量与原始数据集水平合并\n",
    "Profit_New = pd.concat([Profit,dummies], axis = 1)\n",
    "# 删除State变量和California变量（因为State变量已被分解为哑变量，New York变量需要作为参照组）\n",
    "Profit_New.drop(labels = ['State','New York'], axis = 1, inplace = True)\n",
    "# 拆分数据集Profit_New\n",
    "train, test = model_selection.train_test_split(Profit_New, test_size = 0.2, random_state=1234)\n",
    "# 建模\n",
    "model2 = sm.formula.ols('Profit~RD_Spend+Administration+Marketing_Spend+Florida+California', data = train).fit()\n",
    "print('模型的偏回归系数分别为：\\n', model2.params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['YearsExperience', 'Salary'], dtype='object')"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "income.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9782416184887598"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "income.Salary.corr(income.YearsExperience)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RD_Spend           0.978437\n",
       "Administration     0.205841\n",
       "Marketing_Spend    0.739307\n",
       "Profit             1.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Profit.drop('State',axis=1).corrwith(Profit['Profit'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>RD_Spend</th>\n",
       "      <th>Administration</th>\n",
       "      <th>Marketing_Spend</th>\n",
       "      <th>Profit</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>RD_Spend</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.243438</td>\n",
       "      <td>0.711654</td>\n",
       "      <td>0.978437</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Administration</td>\n",
       "      <td>0.243438</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.037280</td>\n",
       "      <td>0.205841</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Marketing_Spend</td>\n",
       "      <td>0.711654</td>\n",
       "      <td>-0.037280</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.739307</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Profit</td>\n",
       "      <td>0.978437</td>\n",
       "      <td>0.205841</td>\n",
       "      <td>0.739307</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 RD_Spend  Administration  Marketing_Spend    Profit\n",
       "RD_Spend         1.000000        0.243438         0.711654  0.978437\n",
       "Administration   0.243438        1.000000        -0.037280  0.205841\n",
       "Marketing_Spend  0.711654       -0.037280         1.000000  0.739307\n",
       "Profit           0.978437        0.205841         0.739307  1.000000"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Profit.drop('State',axis=1).corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
